#!/usr/bin/env python
"""
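Convert a Genbank annotation file into a GFF3 file, adding gene/transcript
identifiers and biotypes to every feature that carries a locus_tag.

This file was developed against the Genbank annotation of E. coli.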
"""
import argparse
from BCBio import GFF
from Bio import SeqIO
from collections import Counter, OrderedDict


def parse_args():
    """Read the two positional command-line arguments.

    Returns:
        argparse.Namespace with ``genbank`` (the Genbank file) and ``gff3``
        (the output GFF3 path).
    """
    positionals = (
        ('genbank', 'Genbank file'),
        ('gff3', 'Output GFF3'),
    )
    cli = argparse.ArgumentParser()
    for arg_name, arg_help in positionals:
        cli.add_argument(arg_name, help=arg_help)
    return cli.parse_args()


# Feature types that get transcript-level identifiers and a biotype.
_TRANSCRIPT_TYPES = ('CDS', 'ncRNA', 'tRNA', 'rRNA')
# Qualifiers removed from every processed feature before writing.
_DROPPED_QUALIFIERS = ('translation', 'transl_table', 'codon_start')
# Qualifier keys whose capitalisation is preserved; all others are lowercased.
_CASE_SENSITIVE_KEYS = ('ID', 'Parent', 'Name')


def _annotate_feature(feature, lt_counts):
    """Add gene/transcript identifiers and biotypes to one feature in place.

    Args:
        feature: object with a ``type`` string and a ``qualifiers`` mapping of
            key -> list of strings; it must contain a ``locus_tag`` qualifier.
        lt_counts: Counter of how many transcript-like features have been seen
            per locus tag so far; incremented for transcript-like features.
    """
    qualifiers = feature.qualifiers
    locus_tag = qualifiers['locus_tag'][0]
    qualifiers['gene_id'] = [locus_tag]
    # Fall back to the locus tag when the feature has no 'gene' qualifier.
    qualifiers['gene_name'] = [qualifiers.get('gene', qualifiers['gene_id'])[0]]
    if feature.type not in _TRANSCRIPT_TYPES:
        return

    # The running per-locus count keeps transcript ids unique when several
    # transcript-like features share one locus tag.
    seen = lt_counts[locus_tag]
    qualifiers['transcript_id'] = ['{}-{}'.format(locus_tag, seen)]
    qualifiers['transcript_name'] = ['{}-{}-{}'.format(qualifiers['gene_name'][0],
                                                       locus_tag, seen)]
    if feature.type == 'CDS':
        if 'pseudo' in qualifiers:
            # The pseudo flag is expressed through the biotype instead.
            del qualifiers['pseudo']
            biotype = 'pseudogene'
        else:
            biotype = 'protein_coding'
    else:
        biotype = feature.type
    qualifiers['gene_biotype'] = [biotype]
    qualifiers['transcript_biotype'] = [biotype]
    lt_counts[locus_tag] += 1


def _clean_qualifiers(qualifiers):
    """Return a cleaned-up, parseable copy of a feature's qualifiers.

    Drops the translation-related qualifiers, turns value-less qualifiers
    into ``['True']``, lowercases every key except ID/Parent/Name, and
    collapses multi-valued qualifiers into a single space-joined value (with
    semicolons in the joined values replaced by ``%3B``).

    Args:
        qualifiers: mapping of key -> list of strings.

    Returns:
        OrderedDict mapping each cleaned key to a list of values.
    """
    cleaned = OrderedDict()
    for key, val in qualifiers.items():
        if key in _DROPPED_QUALIFIERS:
            continue
        if sum(len(x) for x in val) == 0:
            # Value-less flag qualifier. This must be a one-element list: a
            # bare string would be iterated character by character by the
            # collapse step below and come out as 'T r u e'.
            val = ['True']
        # no upper case keys unless it is ID or Parent or Name
        if key not in _CASE_SENSITIVE_KEYS:
            key = key.lower()
        # collapse to a single item, replacing semicolons in the joined values
        if len(val) > 1:
            val = [' '.join([x.replace(';', '%3B') for x in val])]
        cleaned[key] = val
    return cleaned


def main():
    """Convert the Genbank file named on the command line into a GFF3 file.

    Features without a ``locus_tag`` qualifier are left untouched; every
    other feature is annotated and has its qualifiers cleaned before all
    records are written out.
    """
    args = parse_args()
    lt_counts = Counter()
    records = list(SeqIO.parse(args.genbank, format='genbank'))
    for seqrecord in records:
        for feature in seqrecord.features:
            if 'locus_tag' not in feature.qualifiers:
                continue
            _annotate_feature(feature, lt_counts)
            feature.qualifiers = _clean_qualifiers(feature.qualifiers)

    with open(args.gff3, 'w') as fh:
        GFF.write(records, fh)


if __name__ == '__main__':
    main()
